Let's import the needed modules and create a helper function for visualizing a classifier's decision boundaries.
import geowombat as gw
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
from geowombat.ml import fit, fit_predict
from sklearn.cluster import KMeans
from sklearn.metrics import balanced_accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
# read in training data
training = gpd.read_file(r"./data/training.geojson")
training.head()
def visualize_classifier(model, X, y, ax=None, cmap='rainbow'):
    """Fit a model on two features and plot its decision regions."""
    ax = ax or plt.gca()

    # Plot the training points
    ax.scatter(X[:, 0], X[:, 1], c=y, s=30, cmap=cmap,
               clim=(y.min(), y.max()), zorder=3)
    ax.axis('tight')
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    # Fit the estimator on a grid spanning the feature space
    model.fit(X, y)
    xx, yy = np.meshgrid(np.linspace(*xlim, num=200),
                         np.linspace(*ylim, num=200))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    # Create a color plot with the results
    n_classes = len(np.unique(y))
    ax.contourf(xx, yy, Z, alpha=0.3,
                levels=np.arange(n_classes + 1) - 0.5,
                cmap=cmap, zorder=1)

    # Outline the decision boundaries
    ax.contour(xx, yy, Z, colors='k', linewidths=0.5)

    # Label the axes (band 1 = EVI doy 90 on x, band 2 = EVI doy 181 on y)
    ax.set_xlabel('EVI doy 90')
    ax.set_ylabel('EVI doy 181')
    ax.set(xlim=xlim, ylim=ylim)
    plt.show()
First, let's do a simple unsupervised classification using two images of Enhanced Vegetation Index (EVI) values from 2023: the first from day of year (doy) 90 and the second from doy 181.
To start, let's look at the images. Higher values indicate healthier or more vigorous vegetation. There are also some 0 values in the doy 181 image, which correspond to a cloud.
# plot the two EVI images side by side
fig, (ax1, ax2) = plt.subplots(dpi=200, figsize=(10, 5), ncols=2, sharey=True)

# open the EVI images as a single two-band stack
with gw.open(
    ["./data/TZ_EVI_2023_90.tif", "./data/TZ_EVI_2023_181.tif"],
    band_names=["evi doy 90", "evi doy 181"],
    nodata=0,
    stack_dim="band",
) as src:
    # plot each band
    src.sel(band='evi doy 90').plot(robust=True, ax=ax1)
    src.sel(band='evi doy 181').plot(robust=True, ax=ax2)
plt.tight_layout(pad=1)
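As a quick sanity check on the cloud, we can count the zero-valued pixels in each image. This is a minimal sketch, assuming the same file paths as above:

# count zero-valued (cloud/nodata) pixels in each image
with gw.open(
    ["./data/TZ_EVI_2023_90.tif", "./data/TZ_EVI_2023_181.tif"],
    band_names=["evi doy 90", "evi doy 181"],
    stack_dim="band",
) as src:
    for band in src.band.values:
        n_zero = int((src.sel(band=band) == 0).sum())
        print(f"{band}: {n_zero} zero-valued pixels")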
OK, now let's train, fit, and predict with a model. Here we create a k-means unsupervised classifier to group the data into 5 clusters, which hopefully correspond to our crop types.
# set plot size
fig, ax = plt.subplots(dpi=200, figsize=(5, 5))

# set up the model pipeline
km = Pipeline([("clf", KMeans(n_clusters=5, random_state=0))])

# open the images, fit the model, and plot the prediction
with gw.open(
    [r"./data/TZ_EVI_2023_90.tif", r"./data/TZ_EVI_2023_181.tif"],
    stack_dim="band",
    nodata=0,
) as src:
    # fit the unsupervised model and predict cluster labels
    y = fit_predict(src, km)
    y.plot(robust=True, ax=ax)
    plt.tight_layout(pad=1)
    # save the classified image
    y.gw.to_raster(r"./data/predictions_kmeans5.tif", overwrite=True)
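To get a feel for the result, we can count how many pixels landed in each cluster. This is a minimal sketch, assuming `y` still holds the cluster labels from the block above and that nodata pixels come back as NaN:

# count pixels per cluster label, ignoring NaN (nodata)
vals = y.values[~np.isnan(y.values)]
labels, counts = np.unique(vals, return_counts=True)
for label, count in zip(labels, counts):
    print(f"cluster {int(label)}: {count} pixels")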
After a bit of code gymnastics, let's visualize the k-means classification rule. Here we call `fit` with a throwaway GaussianNB pipeline simply to extract the training samples (`Xy`) that `visualize_classifier` needs:
cl = Pipeline([("clf", GaussianNB())])

with gw.open(
    [r"./data/TZ_EVI_2023_90.tif", r"./data/TZ_EVI_2023_181.tif"],
    stack_dim="band",
    nodata=0,
) as src:
    # fit returns the full feature array (X), the training samples (Xy),
    # and the fitted pipeline
    X, Xy, pipe = fit(src, cl, training, col="lc_code")

visualize_classifier(km, Xy[0].values, Xy[1].values)
Next we will use the data that you collected (location, crop type) to train a supervised classification model.
Let's start by plotting the two images from 2023 (doy 90 and doy 181) and overlaying the training data, colored by land cover class.
# read in the training data
training = gpd.read_file(r"./data/training.geojson")
training.head()

# plot images and training data
fig, (ax1, ax2) = plt.subplots(dpi=200, figsize=(10, 5), ncols=2, sharey=True)

# open the images
with gw.open(
    ["./data/TZ_EVI_2023_90.tif", "./data/TZ_EVI_2023_181.tif"],
    band_names=["evi doy 90", "evi doy 181"],
    nodata=0,
    stack_dim="band",
) as src:
    # plot each image with the training points on top
    src.sel(band='evi doy 90').plot(robust=True, ax=ax1)
    training.plot(ax=ax1, column="lc_code", cmap="tab20", legend=False)
    src.sel(band='evi doy 181').plot(robust=True, ax=ax2)
    training.plot(ax=ax2, column="lc_code", cmap="tab20", legend=False)
plt.tight_layout(pad=1)
We can also take a look at the crop type "class labels" that we have for this particular location. Here are all the crop types:
# First, we need to know what the codes mean.
lc_codes = training[['lc_code','Primary land cover']].sort_values(['lc_code']).drop_duplicates()
lc_codes
|   | lc_code | Primary land cover |
|---|---|---|
| 9 | 0 | Millet (Ulezi)* |
| 19 | 1 | Rice (Mpunga)* |
| 0 | 2 | Sorghum (Mtama)* |
| 11 | 3 | Sunflower (Alizeti)* |
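Before training a supervised model, it is also worth checking how many points we have per class, since heavily unbalanced training data can bias a classifier. A quick sketch using the `training` GeoDataFrame loaded above:

# count training points per land cover class
print(
    training.groupby(['lc_code', 'Primary land cover'])
    .size()
    .rename('n_points')
)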
Let's look at the pixel values for a few of the places where we collected crop type data. Remember, a high EVI means healthy vegetation was present during that time period.
with gw.open(
    [r"./data/TZ_EVI_2023_90.tif", r"./data/TZ_EVI_2023_181.tif"],
    stack_dim="band",
    nodata=0,
    band_names=['EVI-90', 'EVI-181'],
) as src:
    # extract the pixel values at the training points
    data = gw.extract(src, training)

# rename the columns
data.columns = ['Primary land cover', 'lc_code', 'geometry', 'id', 'EVI-90', 'EVI-181']

# display the data
display(data[['geometry', 'EVI-90', 'EVI-181', 'lc_code', 'Primary land cover']])
|   | geometry | EVI-90 | EVI-181 | lc_code | Primary land cover |
|---|---|---|---|---|---|
| 0 | POINT (3996832.903 -742427.108) | 1654 | 2053 | 2 | Sorghum (Mtama)* |
| 1 | POINT (3997138.651 -741757.314) | 1140 | 1649 | 3 | Sunflower (Alizeti)* |
| 2 | POINT (3997317.689 -741547.149) | 1224 | 1845 | 3 | Sunflower (Alizeti)* |
| 3 | POINT (3997175.563 -741215.686) | 1884 | 3157 | 3 | Sunflower (Alizeti)* |
| 4 | POINT (3997777.393 -740791.102) | 854 | 1465 | 2 | Sorghum (Mtama)* |
| 5 | POINT (3997889.361 -740640.430) | 903 | 2715 | 3 | Sunflower (Alizeti)* |
| 6 | POINT (3997094.237 -740171.021) | 4700 | 1767 | 2 | Sorghum (Mtama)* |
| 7 | POINT (3997370.015 -740041.437) | 5283 | 1939 | 3 | Sunflower (Alizeti)* |
| 8 | POINT (3998498.357 -739940.516) | 1183 | 2025 | 2 | Sorghum (Mtama)* |
| 9 | POINT (3998361.830 -739940.068) | 1113 | 1715 | 0 | Millet (Ulezi)* |
| 10 | POINT (3997493.143 -739930.252) | 2650 | 2478 | 2 | Sorghum (Mtama)* |
| 11 | POINT (3998478.837 -739840.230) | 2298 | 1419 | 3 | Sunflower (Alizeti)* |
| 12 | POINT (3997990.730 -739560.309) | 1430 | 3592 | 2 | Sorghum (Mtama)* |
| 13 | POINT (3998726.847 -739272.438) | 2410 | 3342 | 2 | Sorghum (Mtama)* |
| 14 | POINT (3996786.362 -739202.682) | 4292 | 2766 | 3 | Sunflower (Alizeti)* |
| 15 | POINT (3998342.161 -739120.684) | 1857 | 4021 | 2 | Sorghum (Mtama)* |
| 16 | POINT (3997059.714 -739095.305) | 2703 | 3401 | 2 | Sorghum (Mtama)* |
| 17 | POINT (3998697.511 -739050.741) | 3841 | 3623 | 1 | Rice (Mpunga)* |
| 18 | POINT (3996646.924 -738942.169) | 4062 | 4145 | 1 | Rice (Mpunga)* |
| 19 | POINT (3996783.898 -738897.867) | 889 | 3403 | 1 | Rice (Mpunga)* |
| 20 | POINT (3998785.742 -738827.439) | 2823 | 4300 | 2 | Sorghum (Mtama)* |
| 21 | POINT (3997185.790 -738819.228) | 2044 | 3674 | 2 | Sorghum (Mtama)* |
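It can also help to see how separable the classes are in this two-band feature space before fitting anything. A minimal sketch, assuming the `data` DataFrame extracted above:

# scatter the training samples in EVI feature space, colored by class
fig, ax = plt.subplots(dpi=150, figsize=(5, 5))
points = ax.scatter(data['EVI-90'], data['EVI-181'],
                    c=data['lc_code'], cmap='rainbow', s=30)
ax.set_xlabel('EVI doy 90')
ax.set_ylabel('EVI doy 181')
ax.legend(*points.legend_elements(), title='lc_code')
plt.show()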
Now let's use that ground truth data to train, fit, and predict with a supervised classification model, in this case GaussianNB, which is not entirely dissimilar to a maximum likelihood classifier.
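Roughly speaking, GaussianNB fits a Gaussian distribution to each band for each class and assigns a pixel to the class with the highest likelihood (weighted by the class prior), much as a classic maximum likelihood classifier does with a diagonal covariance. As an illustration, we could fit it on just the extracted point samples (`data` from above) and peek at the per-class parameters:

# toy fit on the extracted samples only, to inspect the fitted Gaussians
toy = GaussianNB().fit(data[['EVI-90', 'EVI-181']], data['lc_code'])
print("per-class means:\n", toy.theta_)
print("per-class variances:\n", toy.var_)  # `sigma_` in older scikit-learn

With that intuition, let's fit the model over the full rasters.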
# Create the classifier
cl = Pipeline([("clf", GaussianNB())])

# Create the figure
fig, ax = plt.subplots(dpi=200, figsize=(5, 5))

# Fit the classifier and predict
with gw.open(
    [r"./data/TZ_EVI_2023_90.tif", r"./data/TZ_EVI_2023_181.tif"],
    stack_dim="band",
    nodata=0,
) as src:
    y = fit_predict(src, cl, training, col="lc_code")
    y.plot(robust=True, ax=ax)
    # Plot the training data on top of the prediction
    training.plot(ax=ax, column="lc_code", cmap="tab20", legend=False, s=6)
    plt.tight_layout(pad=1)
    # Save the prediction
    y.gw.save(r"./data/predictions_gaussian.tif", overwrite=True)
Let's visualize the classification rule:
visualize_classifier(cl, Xy[0].values, Xy[1].values)
For comparison, let's train another model using RandomForestClassifier:
from sklearn.ensemble import RandomForestClassifier
rf = Pipeline([("clf", RandomForestClassifier(n_estimators=500, random_state=0))])

fig, ax = plt.subplots(dpi=200, figsize=(5, 5))

with gw.open(
    [r"./data/TZ_EVI_2023_90.tif", r"./data/TZ_EVI_2023_181.tif"],
    stack_dim="band",
    nodata=0,
) as src:
    y = fit_predict(src, rf, training, col="lc_code")
    y.plot(robust=True, ax=ax)
    training.plot(ax=ax, column="lc_code", cmap="tab20", legend=False, s=6)
    plt.tight_layout(pad=1)
    y.gw.save(r"./data/predictions_RF.tif", overwrite=True)
Let's visualize the classification rule:
visualize_classifier(rf, Xy[0].values, Xy[1].values)
Let's compare our predictions (on the raster) to the data you collected.
To start, let's get our predicted labels from GaussianNB at each training location:
with gw.open("./data/predictions_gaussian.tif") as src:
df = src.gw.extract(training,band_names=['prediction'])
df.head()
|   | Primary land cover | lc_code | geometry | id | prediction |
|---|---|---|---|---|---|
| 0 | Sorghum (Mtama)* | 3 | POINT (3996832.903 -742427.108) | 0 | 3.0 |
| 1 | Sunflower (Alizeti)* | 4 | POINT (3997138.651 -741757.314) | 1 | 4.0 |
| 2 | Sunflower (Alizeti)* | 4 | POINT (3997317.689 -741547.149) | 2 | 4.0 |
| 3 | Sunflower (Alizeti)* | 4 | POINT (3997175.563 -741215.686) | 3 | 3.0 |
| 4 | Sorghum (Mtama)* | 3 | POINT (3997777.393 -740791.102) | 4 | 4.0 |
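As a first rough check, we can compute the plain fraction of points where the predicted code matches the field label. A one-liner, assuming `df` from above:

# overall (unbalanced) agreement between field labels and predictions
print(f"overall accuracy: {(df['lc_code'] == df['prediction']).mean():.2%}")

For a fuller picture, a confusion matrix breaks the agreement down by class: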
from sklearn.metrics import confusion_matrix
import seaborn as sns

# rows of `mat` are true labels, columns are predictions;
# transpose so predictions run along the heatmap's y-axis
mat = confusion_matrix(y_true=df['lc_code'], y_pred=df['prediction'])
sns.heatmap(mat.T, square=True, annot=True, cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label')
plt.show()
# crop codes
lc_codes
|   | lc_code | Primary land cover |
|---|---|---|
| 9 | 0 | Millet (Ulezi)* |
| 19 | 1 | Rice (Mpunga)* |
| 0 | 2 | Sorghum (Mtama)* |
| 11 | 3 | Sunflower (Alizeti)* |
score = balanced_accuracy_score(y_true=df['lc_code'], y_pred=df['prediction'])
# Print the balanced accuracy score as a percentage
print(f'Balanced Accuracy Score: {score:.2%}')
Balanced Accuracy Score: 74.13%
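Balanced accuracy is just the mean of the per-class recalls (the diagonal of the confusion matrix divided by its row sums), so we can reproduce it by hand. A sketch, assuming `mat` and `df` from above:

# per-class recall = diagonal / row sums (rows are true labels)
recalls = np.diag(mat) / mat.sum(axis=1)
for code, r in zip(np.unique(df['lc_code']), recalls):
    print(f"class {int(code)}: recall = {r:.2%}")
print(f"mean recall (balanced accuracy) = {recalls.mean():.2%}")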